# -*- coding: utf-8 -*-
"""
Created on Sat Mar 09 15:37:04 2019

@author: Richard
"""

from bs4 import BeautifulSoup
import requests
from datetime import datetime
import calendar

# Start from the headers requests sends by default, then replace the
# User-Agent entry with a desktop Chrome string.
headers = requests.utils.default_headers()
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'

baseURL = 'https://adamsmith.house.gov/press-releases?page='

# Walk the paginated press-release index (pages 1 through 101) and collect
# the absolute URL of every release linked from a 'recordListTitle' cell.
urlsList = []
errorsList = []  # URLs whose fetch or parse failed
for counter in range(1, 102):
    print("URL counter: "+str(counter))
    targetURL = baseURL+str(counter)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(targetURL, headers=headers, timeout=30)
        # Treat HTTP error responses as failures instead of parsing the
        # error page as if it were an empty listing.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html5lib')
        for link in soup.find_all('td', {'class': 'recordListTitle'}):
            url = link.find('a').attrs['href']
            urlsList.append('https://adamsmith.house.gov'+str(url))
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C and SystemExit
        # still stop the script.
        print("error has occurred at "+str(counter)+": "+repr(exc))
        errorsList.append(targetURL)

# Month names indexed by month number; index 0 is the empty string.
monthNames = list(calendar.month_name)

# Fetch every collected press release and store it as "M.D.YYYY<TAB>text".
pressList = []
# enumerate() numbers the releases from 1 instead of continuing whatever
# value `counter` was left at by earlier code.
for counter, targetURL in enumerate(urlsList, 1):
    print("Press Release counter: "+str(counter))
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(targetURL, headers=headers, timeout=30)
        # HTTP error responses count as failures rather than being parsed.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html5lib')

        # Grabbing the date-stamp for press release:
        stamp = soup.find('span', attrs={"class": "date"})
        monthName = stamp.find('span', attrs={"class": "month"}).get_text()
        # The position of the full month name in calendar.month_name is the
        # month number; index() raises ValueError for an unknown name.
        month = monthNames.index(monthName)
        if month == 0:
            # An empty string matches the placeholder entry at index 0.
            raise ValueError("empty month name")
        day = stamp.find('span', attrs={"class": "day"}).get_text()
        year = stamp.find('span', attrs={"class": "year"}).get_text()
        date = "%s.%s.%s" % (month, day, year)

        # Grabbing the text for press release:
        content = soup.find('div', attrs={"class": "post-content"})

        # Drop scripts, styles and hidden elements so that only the text
        # meant to be visible is kept.
        unwanted = (
            ('script', {}),
            ('style', {}),
            ('span', {'class': 'hidden'}),
            ('div', {'class': 'hidden'}),
        )
        for tagName, tagAttrs in unwanted:
            for tag in content.find_all(tagName, tagAttrs):
                tag.extract()

        # split()/join collapses every run of whitespace (including \r, \n
        # and \t) to one space and trims both ends.
        body = ' '.join(content.get_text().split())
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C and SystemExit
        # still stop the script.
        print("error has occurred with url: "+str(targetURL)+" ("+repr(exc)+")")
        errorsList.append(targetURL)
        # Skip the row: there is no valid date/text pair for this release.
        continue

    pressList.append(date+'\t'+body)

# Write one UTF-8 encoded entry per line: the scraped releases first, then
# the URLs that failed.  `with` closes each file even if a write raises.
for path, lines in (("adamSmithPress.txt", pressList),
                    ("adamSmithPressErrors.txt", errorsList)):
    with open(path, "w") as text_file:
        for line in lines:
            text_file.write(line.encode('utf-8')+"\n")